# -*- coding: utf-8 -*-
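"""Multithreaded scraper for Beijing Python job listings on search.51job.com.

Producer threads (CrawlPage) fetch search-result pages into a queue;
consumer threads (CrawlData) parse each page with lxml and collect salary
records, which are finally written to CSV and averaged with pandas.
"""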
import time
import requests
import threading
from queue import Queue, Empty  # thread-safe queue for threads; multiprocessing.Queue targets processes
from lxml import etree
import pandas as pd
import re


class CrawlPage(threading.Thread):
    def __init__(self, page_queue, data_queue, thread_name):
        super(CrawlPage, self).__init__()
        self.page_queue = page_queue
        self.data_queue = data_queue
        self.thread_name = thread_name
        # Default request headers (mimicking a desktop Chrome browser)
        self.header = {
            "Accept":
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Host": "search.51job.com",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
        }


    def run(self):
        print("Page-crawl thread started: {}".format(self.thread_name))
        # 1. Keep pulling page numbers from page_queue until it is empty.
        # 2. Fetch each page with requests and push the raw HTML into data_queue.
        while not page_flag:
            try:
                page = self.page_queue.get(block=False)
            except Empty:
                continue
            # e.g. https://search.51job.com/list/010000,000000,0000,00,9,99,python,2,1.html
            page_url = 'https://search.51job.com/list/010000,000000,0000,00,9,99,python,2,{}.html'.format(page)
            try:
                response = requests.get(url=page_url, headers=self.header)
                print("Fetched {} with status code {}".format(page_url, response.status_code))
                # 51job serves GBK-encoded pages; set the encoding before reading .text.
                response.encoding = 'gbk'
                self.data_queue.put(response.text)
            except requests.RequestException as e:
                print("Request failed for {}: {}".format(page_url, e))


# Parsed records from all worker threads are accumulated here.
# (list.append is atomic under CPython's GIL, so no explicit lock is needed.)
info_data = []


class CrawlData(threading.Thread):
    def __init__(self, thread_name, data_queue):
        super(CrawlData, self).__init__()
        self.thread_name = thread_name
        self.data_queue = data_queue

    def run(self):
        print("Data-processing thread started: {}".format(self.thread_name))
        while not data_flag:
            try:
                text = self.data_queue.get(block=False)
            except Empty:
                continue
            print("Pages left to parse: {}".format(self.data_queue.qsize()))
            html = etree.HTML(text)
            all_div = html.xpath("//div[@id='resultList']//div[@class='el']")
            info_list = []
            for item in all_div:
                info = {}
                try:
                    # Each XPath query returns a list; take the first match.
                    info['job_title'] = item.xpath("./p/span/a/@title")[0]
                    info['company'] = item.xpath(".//span[@class='t2']/a/@title")[0]
                    info['location'] = item.xpath(".//span[@class='t3']/text()")[0]
                    info['posted'] = item.xpath(".//span[@class='t5']/text()")[0]
                except IndexError:
                    continue  # skip rows missing an expected field
                # The salary field may be empty; substitute a sentinel value.
                try:
                    info['salary'] = item.xpath(".//span[@class='t4']/text()")[0]
                except IndexError:
                    info['salary'] = 'N/A'
                if info['salary'] != 'N/A':
                    # Take the upper bound of a range like '1.5-2万/月'.
                    bounds = re.findall(r'-(.*?)/', info['salary'])
                    if not bounds:
                        continue
                    info['max_salary'] = transform(bounds[0])
                    info_list.append(info)
                    info_data.append(info)

            print("Thread {} parsed records: {}".format(self.thread_name, info_list))
            time.sleep(1)

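# Illustrative shape of one parsed record (values are hypothetical; the
# 51job result-page layout is assumed as of the time of writing):
# {'job_title': 'Python开发工程师', 'company': '某科技有限公司',
#  'location': '北京-朝阳区', 'posted': '07-01',
#  'salary': '1.5-2万/月', 'max_salary': '20000'}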

# Global flags that end the worker while-loops once each queue is drained.
page_flag = False
data_flag = False
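# A threading.Event (event.set() / event.is_set()) would be the more
# idiomatic stop signal, but plain booleans suffice here because the main
# thread only ever flips them from False to True.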


# Convert a salary bound such as '2万' or '1.5千' to a plain yuan amount string.
def transform(amount):
    if "万" in amount:  # 万 = 10,000 yuan
        return str(int(float(amount.replace("万", "")) * 10000))
    if "千" in amount:  # 千 = 1,000 yuan
        return str(int(float(amount.replace("千", "")) * 1000))
    # Unrecognized unit: return the value unchanged.
    return amount
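
# Illustrative conversions (salary formats assumed from observed listings):
#   transform("2万")   -> "20000"
#   transform("1.5万") -> "15000"
#   transform("8千")   -> "8000"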


def main():
    # Queues holding page numbers to fetch and raw HTML pages to parse.
    page_queue = Queue()
    data_queue = Queue()

    # Enqueue page numbers 1-39 of the search results.
    for page in range(1, 40):
        page_queue.put(page)
    print("Total pages queued: {}".format(page_queue.qsize()))

    # Start the page-crawling threads.
    global page_flag
    page_thread_name = ['page-crawler-1', 'page-crawler-2', 'page-crawler-3']
    page_crawl_list = []
    for page_thread in page_thread_name:
        page_crawl = CrawlPage(page_queue, data_queue, page_thread)
        page_crawl.start()
        page_crawl_list.append(page_crawl)

    # Block the main thread until every page number has been taken.
    while not page_queue.empty():
        time.sleep(0.1)  # sleep briefly instead of spinning at 100% CPU

    # Once page_queue is drained, raise the flag to end the CrawlPage.run() loops.
    page_flag = True
    for page_crawl_join in page_crawl_list:
        page_crawl_join.join()
        print("{} finished crawling.".format(page_crawl_join.thread_name))
    print("Pages of HTML awaiting parsing: {}".format(data_queue.qsize()))

    # Start three text-processing threads. Note that fetching and parsing run
    # as two sequential phases: the parsers start only after crawling is done.
    crawl_thread_name = ['data-parser-1', 'data-parser-2', 'data-parser-3']
    crawl_data_list = []
    for crawl_data_name in crawl_thread_name:
        crawl_data = CrawlData(crawl_data_name, data_queue)
        crawl_data.start()
        crawl_data_list.append(crawl_data)

    # Block the main thread until data_queue has been drained.
    while not data_queue.empty():
        time.sleep(0.1)

    global data_flag
    # All pages parsed: raise the flag so the parser threads exit.
    data_flag = True
    for crawl_data_join in crawl_data_list:
        crawl_data_join.join()
        print("Thread {} finished.".format(crawl_data_join.thread_name))

    # Queue size should now be zero; HTML parsing is complete.
    print("Items left in data_queue: {}".format(data_queue.qsize()))
    # Convert the list of record dicts to a DataFrame for CSV output.
    df = pd.DataFrame(info_data)
    # Cast the salary strings to integers.
    df['max_salary'] = df['max_salary'].astype('int')
    # Write the CSV (utf_8_sig adds a BOM so Excel detects the encoding).
    df.to_csv("beijing_python_jobs.csv", encoding="utf_8_sig")
    # Report the average maximum salary over all scraped listings.
    print("Average maximum salary for Python jobs in Beijing: %.2f yuan/month" % df['max_salary'].mean())


if __name__ == '__main__':
    main()
